#Loading the libraries
# Notebook shell magic — not valid in a .py file. Note: the PDF backend is a
# module *inside* matplotlib, not a separate pip package; the correct command
# would be:  pip install matplotlib
# !pip install matplotlib
import pandas as pd #For general purpose Data Manipulation
import numpy as np #For some operations - creating vectors and matrices/log transformation
from matplotlib.backends.backend_pdf import PdfPages #Plotting Box-Whisker
from matplotlib import pyplot as plt #Plotting Box-Whisker --> Outlier Detection
import seaborn as sns #Plotting Box-Whisker and others
from sklearn.model_selection import train_test_split #For Building Train and Test Set
from sklearn.linear_model import LinearRegression #For Regression Analysis
from sklearn.preprocessing import LabelEncoder #for the columns who will suffer from curse of dimensionality
from sklearn.preprocessing import OneHotEncoder #with less number of columns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# --- Load the training data and separate features from the target ---
# NOTE(review): hard-coded absolute Windows path; parameterise before reuse.
train=pd.read_csv("D:/Assignment1_PGD B7/Train.csv")
train.head()
# X = every column except the last; Y = the sales target.
# NOTE(review): assumes "Item_Outlet_Sales" is the last column — confirm.
X=train.iloc[:,:-1]
Y=train["Item_Outlet_Sales"]
# Bare expressions like .head()/.shape only display in a notebook; they are
# no-ops when this file is run as a plain script.
train.head()
Y.head()
Y.shape
#Calculating the shape of original train data set
train.shape
#Calculating the shape of X
X.shape
#Calculating the shape of Y
Y.shape
# Notebook shell magic — not valid in a .py file (the package has since been
# renamed to ydata-profiling):
# !pip install pandas_profiling
# Partition the columns into numeric and categorical views of the frame.
numeric_var_names = [col for col, col_dtype in train.dtypes.items()
                     if col_dtype in ('float64', 'int64', 'float32', 'int32')]
cat_var_names = [col for col, col_dtype in train.dtypes.items()
                 if col_dtype in ('object', 'O')]
print(numeric_var_names)
print(cat_var_names)
# Numeric-only subset.
train_num = train[numeric_var_names]
train_num.head(5)
# Categorical-only subset.
train_cat = train[cat_var_names]
train_cat.head(5)
# Getting Quick Summary
def quick_summary(x):
    """Summarise one Series: non-null count, missing count, value frequencies.

    Parameters
    ----------
    x : pd.Series
        Column to summarise.

    Returns
    -------
    pd.Series indexed ['N', 'NMISS', 'ColumnsNames'] holding, in order, the
    non-null count, the null count, and the full value_counts() Series.
    """
    # FIX(review): the notebook export stripped the body indentation, which
    # made this an IndentationError — re-indented, behavior unchanged.
    return pd.Series([x.count(), x.isnull().sum(), x.value_counts()],
                     index=['N', 'NMISS', 'ColumnsNames'])
# Column-wise summary of the numeric frame.
# NOTE(review): apply(quick_summary) would work without the lambda wrapper.
num_summary=train_num.apply(lambda x: quick_summary(x))
num_summary
train_num.describe()
# Write one box-and-whisker plot per numeric column into x.pdf for outlier
# inspection.
# FIX(review): the export stripped the loop-body indentation (a syntax
# error as written); re-indented here.
bp = PdfPages('x.pdf')
for num_variable in numeric_var_names:
    fig, axes = plt.subplots(figsize=(10, 4))
    sns.boxplot(x=num_variable, data=train_num)
    plt.title(f'Box Plot of {num_variable}')
    bp.savefig(fig)
    # FIX(review): close each figure after saving — otherwise the loop leaks
    # one open figure per column.
    plt.close(fig)
bp.close()
# Skewness of each numeric column before transformation.
train.skew()
"""This skewness of data needs to be removed by applying transformations"""
import numpy as np
# NOTE(review): Item_Weight gets log THEN sqrt — sqrt(log(w)) is NaN for any
# weight below 1 because log goes negative there; confirm the double
# transform is intentional.
train['Item_Weight'] = np.log(train['Item_Weight'])
train['Item_Weight'] = np.sqrt(train['Item_Weight'])
# Square-root transforms to pull in right-skewed columns.
train['Item_Visibility'] = np.sqrt(train['Item_Visibility'])
train['Item_MRP'] = np.sqrt(train['Item_MRP'])
# Square transform (typically used for left-skewed columns).
train['Outlet_Establishment_Year'] = np.square(train['Outlet_Establishment_Year'])
train['Item_Outlet_Sales'] = np.sqrt(train['Item_Outlet_Sales'])
# Skewness after transformation, for comparison with the call above.
train.skew()
train.shape
"""Skewness is removed after treatment of skewness. Skewness can cause outliers thus it is important that outliers must get
removed"""
def outlier_capping(x):
    """Winsorise a numeric Series at the Tukey fences (Q1/Q3 +/- 1.5*IQR).

    Values above Q3 + 1.5*IQR are replaced by that upper fence, values below
    Q1 - 1.5*IQR by the lower fence; everything in between is unchanged.

    Parameters
    ----------
    x : pd.Series
        Numeric column to cap.

    Returns
    -------
    pd.Series with the same index, capped at both fences.
    """
    iqr = x.quantile(0.75) - x.quantile(0.25)
    ul = x.quantile(0.75) + 1.5 * iqr
    ll = x.quantile(0.25) - 1.5 * iqr
    # FIX(review): Series.clip_upper/clip_lower were removed in pandas 1.0;
    # clip(lower=..., upper=...) is the supported equivalent. The export had
    # also stripped the body indentation (syntax error as written).
    return x.clip(lower=ll, upper=ul)
# Cap every numeric column, then push the capped values back into train.
train_num=train_num.apply(lambda x: outlier_capping(x))
train_num.describe()
# DataFrame.update aligns on index/columns and overwrites values in place.
train.update(train_num)
train.head()
"""Outliers in X and Y both have been removed."""
x = train[["Item_Weight","Item_Visibility"]].values
plt.boxplot(x)
plt.show()
y=train[["Item_MRP","Outlet_Establishment_Year","Item_Outlet_Sales"]].values
plt.boxplot(y)
plt.show()
# Automated EDA reports.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling. Also,
# these ProfileReport objects are created and immediately discarded — they
# only render inline in a notebook, never in a plain script run.
import pandas_profiling as pd_prof
pd_prof.ProfileReport(train)
pd_prof.ProfileReport(train_cat)
pd_prof.ProfileReport(train_num)
train.info()
# Propagate the transformed/capped values from train into the feature frame
# (update overwrites matching cells; it does not add or drop columns).
X.update(train)
X.info()
train.describe()
'''missing value imputation of numerical column "Item_Weight by mean"'''
# Mean-impute the numeric Item_Weight column.
mean = np.mean(train["Item_Weight"])
# FIX(review): assign rather than fillna(..., inplace=True) on a column
# selection — in-place fills on a slice trigger chained-assignment warnings
# (and silently stop working) on recent pandas.
train["Item_Weight"] = train["Item_Weight"].fillna(mean)
train.head()
'''missing value imputation of categorical column "Item_Size" by mode'''
train["Outlet_Size"].mode()
# NOTE(review): assumes "Medium" is the modal Outlet_Size — confirm against
# the mode() output above.
train["Outlet_Size"] = train["Outlet_Size"].fillna("Medium")
train["Outlet_Size"].head()
train.head()
# Rebuild X / Y from the now-imputed frame.
X = train.iloc[:, :-1]
Y = train["Item_Outlet_Sales"]
# FIX(review): DataFrame.drop is not in-place by default — the original
# discarded the result, so Item_Identifier silently stayed in X (and was
# later dummy-encoded into thousands of columns). Assign the result back.
X = X.drop(["Item_Identifier"], axis=1)
# Dummy-encode the two high-cardinality nominal columns.
# NOTE(review): this first x_dummies is displayed and then overwritten by the
# full get_dummies(X) call below — it never feeds the model.
x_dummies=pd.get_dummies(X[["Item_Type","Outlet_Identifier"]]) #Use of get_dummies instead of One-Hot Encoding
x_dummies.head()
# Integer-encode the low-cardinality (roughly ordinal) columns.
# NOTE(review): LabelEncoder is documented for encoding *targets*; the
# integer order it assigns here is alphabetical, not a true ordinal ranking —
# verify that is acceptable before interpreting coefficients.
le=LabelEncoder()
X["Outlet_Size"]=le.fit_transform(X["Outlet_Size"])
X["Outlet_Location_Type"]=le.fit_transform(X["Outlet_Location_Type"])
X["Outlet_Type"]=le.fit_transform(X["Outlet_Type"])
X["Item_Fat_Content"]=le.fit_transform(X["Item_Fat_Content"])
# Dummy-encode whatever object columns remain in X.
x_dummies=pd.get_dummies(X)
x_dummies.head()
x_dummies.skew()
# Distribution check: are the main numeric features roughly normal?
# One histogram per feature; the count axis label is shared.
from matplotlib import pyplot as plt

for feature in ("Item_Weight", "Item_Visibility", "Item_MRP",
                "Outlet_Establishment_Year"):
    plt.hist(x_dummies[feature])
    plt.xlabel(feature)
    plt.ylabel('Number of Items')
    plt.show()
# Analysis that can be performed:
# Bivariate relationships between the numeric features.
plt.scatter(x_dummies["Item_Weight"],x_dummies["Item_Visibility"],color='violet')
plt.xlabel("Item_Weight")
plt.ylabel("Item_Visibility")
plt.title("Item_Weight Vs Item_Visibility")
plt.show()
plt.scatter(x_dummies["Item_MRP"],x_dummies["Outlet_Establishment_Year"],color='orange')
plt.xlabel("Item_MRP")
plt.ylabel("Outlet_Establishment_Year")
plt.title("Item_MRP Vs Outlet_Establishment_Year")
plt.show()
# Pairwise Pearson correlations of every encoded feature.
matrix=x_dummies.corr()
matrix
plt.rcParams['figure.figsize'] = [10,10]
from matplotlib import pyplot as plt
# Raw matrix heat map of the correlations (matshow draws no axis labels).
plt.matshow(matrix,cmap='Blues')
plt.show()
import seaborn as sns
# NOTE(review): seaborn >= 0.12 rejects these positional x/y arguments —
# pass x=... and y=... keywords if this call fails.
sns.stripplot(x_dummies['Item_Weight'],x_dummies['Item_MRP'],jitter=0.3,size=4)
plt.show()
# Repeat the skew transforms, now on the encoded feature frame.
# NOTE(review): X.update(train) earlier already copied the transformed values
# into X, so these columns receive log/sqrt/square a *second* time — confirm
# the double application is intended.
x_dummies['Item_Weight'] = np.log(x_dummies['Item_Weight'])
x_dummies['Item_Weight'] = np.sqrt(x_dummies['Item_Weight'])
x_dummies['Item_Visibility'] = np.sqrt(x_dummies['Item_Visibility'])
x_dummies['Item_MRP'] = np.sqrt(x_dummies['Item_MRP'])
x_dummies['Outlet_Establishment_Year'] = np.square(x_dummies['Outlet_Establishment_Year'])
# NOTE(review): X was built as train.iloc[:, :-1], which excludes
# Item_Outlet_Sales, so reading that column here should raise KeyError —
# verify against the actual column list.
x_dummies['Item_Outlet_Sales'] = np.sqrt(x_dummies['Item_Outlet_Sales'])
train.skew()
train.shape
# Full pairwise scatter/histogram matrix of train (slow on wide frames).
sns.pairplot(train)
plt.show()
# Outlet establishment years broken out by outlet size.
import seaborn as sns
from matplotlib import pyplot as plt
g = sns.FacetGrid(train,col="Outlet_Size")
g = g.map(plt.hist,"Outlet_Establishment_Year")
plt.show()
# Item weights broken out by fat-content category.
import seaborn as sns
from matplotlib import pyplot as plt
g = sns.FacetGrid(train,col="Item_Fat_Content")
g = g.map(plt.hist,"Item_Weight")
plt.show()
# NOTE(review): this grid is an exact duplicate of the Outlet_Size /
# Outlet_Establishment_Year grid above — probably a copy-paste leftover.
import seaborn as sns
from matplotlib import pyplot as plt
g = sns.FacetGrid(train,col="Outlet_Size")
g = g.map(plt.hist,"Outlet_Establishment_Year")
plt.show()
# Min-max scale the encoded features into [0, 1].
from sklearn.preprocessing import StandardScaler
# NOTE(review): the StandardScaler import above is unused in this section;
# MinMaxScaler (imported at the top of the file) is what is applied.
sc=MinMaxScaler()
sc_data=sc.fit_transform(x_dummies)
sc_data=pd.DataFrame(sc_data)
# Restore the column labels lost by fit_transform's ndarray return value.
sc_data.columns=x_dummies.columns
sc_data.head()
Y.head()
"""Importing libraries and splitting the data"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
x_train,x_test,y_train,y_test=train_test_split(x_dummies,Y,test_size=0.2,random_state=0)
reg = LinearRegression()
model1=reg.fit(x_train,y_train)
len(x_train)
len(y_train)
"""Predicting Y_pred using x_test"""
Y_pred = reg.predict(x_test)
Y_pred
len(Y_pred)
from sklearn.metrics import r2_score, mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test,Y_pred))
rmse
r2= r2_score(y_test,Y_pred) # Built-in function score() indicates R-Square value
print(r2*100)
pd.set_option("display.float_format",lambda x: "%.3f" % x)
"""Plotting Residual Plots"""
x_axis = [i for i in range(1,len(Y_pred)+1)]
x_plot = plt.scatter(x_axis,(y_test-Y_pred),c='green')
plt.plot(x_axis,[0]*len(Y_pred),c='yellow')
plt.title('Residual Plot')
x_dummies.head()
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import Ridge, Lasso
# Second split: 70/30 this time, overwriting the earlier 80/20 split.
x_train,x_test,y_train,y_test=train_test_split(x_dummies,Y,test_size=0.3,random_state=0)
print(len(x_train),len(y_train))
# Baseline ordinary least squares on the new split; `model` is reused for
# the final predictions at the bottom of the file.
reg=LinearRegression()
model=reg.fit(x_train,y_train)
model.coef_
model.intercept_
y_pred=model.predict(x_test)
from sklearn.metrics import r2_score, mean_squared_error
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2=r2_score(y_test,y_pred)
print("RMSE=",rmse)
print("R2 score=",r2)
#residual plot: residuals vs. observation index, with a zero reference line
x=[i for i in range(1,len(y_pred)+1)]
x_plot=plt.scatter(x,(y_pred - y_test),c='orange')
plt.plot(x,[0]*len(y_pred),c="blue")
plt.title('Residual plot')
# One coefficient per predictor, sorted for the bar chart.
predictors=x_train.columns
predictors
coef=pd.Series(reg.coef_, predictors).sort_values()
coef
# NOTE(review): the chart title says "Modal" — likely a typo for "Model".
coef.plot(kind="bar",title="Modal Coefficients")
"""
alpha is hyperparameter of Ridge, which means that they are not automatically learned by the model instead they have to be set
manually.
"""
#alpha_range=[0.01,0.05,0.5,5]
# Ridge regression (L2 penalty) with a small alpha.
# FIX(review): Ridge(normalize=True) was deprecated in scikit-learn 1.0 and
# removed in 1.2 — the original crashes on current scikit-learn. The
# documented migration is to standardise the features explicitly before
# fitting; coefficients are therefore in standardised-feature units, so their
# scale differs slightly from the old normalize=True output.
ridge_scaler = StandardScaler().fit(x_train)
ridgeReg = Ridge(alpha=0.01)
model = ridgeReg.fit(ridge_scaler.transform(x_train), y_train)
pred = model.predict(ridge_scaler.transform(x_test))
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)
print("RMSE=", rmse, "R2=", r2)
# FIX(review): DataFrame.append was removed in pandas 2.0; build the
# side-by-side coefficient comparison in one shot instead of row-by-row.
df = pd.DataFrame({'Linear Reg': reg.coef_, "Ridge Reg": ridgeReg.coef_})
df.head()
# One coefficient per predictor, sorted for the bar chart.
predictors = x_train.columns
coef = pd.Series(ridgeReg.coef_, predictors).sort_values()
coef.plot(kind='bar', title='Model Coefficients')
print(coef)
"""It tells about what which independent variables hold much significance"""
#alpha_range=[0.01,0.05,0.5,5]
# Lasso regression (L1 penalty) — drives weak coefficients exactly to zero.
# FIX(review): Lasso(normalize=True) was deprecated in scikit-learn 1.0 and
# removed in 1.2 — the original crashes on current scikit-learn. The
# documented migration is to standardise the features explicitly before
# fitting; coefficient scales differ slightly from the old normalize=True
# output as a result.
lasso_scaler = StandardScaler().fit(x_train)
lassoReg = Lasso(alpha=0.01)
model = lassoReg.fit(lasso_scaler.transform(x_train), y_train)
pred = model.predict(lasso_scaler.transform(x_test))
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)
print("RMSE=", rmse)
print("R-square=", r2)
co = lassoReg.coef_
inte = lassoReg.intercept_
# FIX(review): DataFrame.append was removed in pandas 2.0; build the
# three-way coefficient comparison directly.
df = pd.DataFrame({'Linear Reg': reg.coef_,
                   "Ridge Reg": ridgeReg.coef_,
                   'Lasso Reg': lassoReg.coef_})
df.head()
print("RMSE=", rmse, "R2=", r2)
# Sorted Lasso coefficients; zeros mark features the L1 penalty dropped.
coef = pd.Series(lassoReg.coef_, predictors).sort_values()
print(coef)
coef.plot(kind="bar", title="Model Coefficients")
# Dataset columns: Item_Identifier, Item_Weight, Item_Fat_Content,
# Item_Visibility, Item_Type, Item_MRP, Outlet_Identifier,
# Outlet_Establishment_Year, Outlet_Size, Outlet_Location_Type, Outlet_Type,
# Item_Outlet_Sales.
# Positive coefficients (impact on outlet sales), smallest to largest:
#   Item_Weight 2.232, Item_Type_Soft Drinks 6.674, Item_Type_Meat 8.607,
#   Item_Type_Breads 14.587, Item_MRP 14.724, Item_Fat_Content 33.438,
#   Item_Type_Fruits and Vegetables 56.863, Item_Type_Others 65.276,
#   Item_Type_Starchy Foods 70.085, Outlet_Identifier_OUT045 97.045,
#   Outlet_Identifier_OUT013 139.639, Outlet_Identifier_OUT046 220.273,
#   Outlet_Identifier_OUT049 245.044, Outlet_Identifier_OUT017 258.164,
#   Item_Type_Seafood 304.190, Outlet_Type 389.815,
#   Outlet_Identifier_OUT035 393.441, Outlet_Identifier_OUT027 707.095
# Conclusion: Item_Weight, Item_MRP, Outlet_Identifier, Item_Type,
# Outlet_Type and Item_Fat_Content are the independent variables with a
# significant impact on outlet sales.
# --- Prepare the held-out Test.csv with the same cleaning steps ---
# NOTE(review): hard-coded absolute Windows path, as with Train.csv.
test=pd.read_csv("D:/Assignment1_PGD B7/Test.csv")
test.head()
test.isnull().sum()
# Mean-impute Item_Weight using the *test-set* mean.
# NOTE(review): standard practice is to reuse the training-set mean so both
# sets are imputed consistently — confirm intent.
me=np.mean(test["Item_Weight"])
test["Item_Weight"].fillna(me,inplace=True)
test["Outlet_Size"].mode()
# NOTE(review): assumes "Medium" is also the modal Outlet_Size in the test
# set — confirm against the mode() output above.
test["Outlet_Size"].fillna("Medium",inplace=True)
test["Outlet_Size"].head()
test.head()
test.shape
# Confirm no missing values remain after imputation.
test.isnull().sum()
'''Using Get_Dummies Function for nominal Data and Label Encoding
for Ordinal Data'''
import pandas as pd
# Drop the identifier before encoding (assigned back here, unlike the
# train-side drop).
test1=test.drop("Item_Identifier",axis=1)
test1.head()
# NOTE(review): this preview of dummies is overwritten by get_dummies(test1)
# below and never used.
test_dummies=pd.get_dummies(test1[["Item_Type","Outlet_Identifier"]]) #Use of get_dummies instead of One-Hot Encoding
test_dummies.head()
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
# NOTE(review): fitting fresh LabelEncoders on the test set can assign
# different integers than the train-side encoders if the category sets
# differ — verify the categories match between Train.csv and Test.csv.
le=LabelEncoder()
test1["Outlet_Size"]=le.fit_transform(test1["Outlet_Size"])
test1["Outlet_Location_Type"]=le.fit_transform(test1["Outlet_Location_Type"])
test1["Outlet_Type"]=le.fit_transform(test1["Outlet_Type"])
test1["Item_Fat_Content"]=le.fit_transform(test1["Item_Fat_Content"])
# Full dummy-encoding of the remaining object columns.
test_dummies=pd.get_dummies(test1)
test_dummies.head()
test_dummies.shape
# NOTE(review): pairing test-set features with the first 5681 *training*
# targets is not statistically meaningful (Test.csv carries no labels); this
# split only mirrors the original notebook's flow — revisit the evaluation.
Y1 = Y[0:5681]
# FIX(review): train_test_split returns (X_train, X_test, y_train, y_test);
# the original unpacked the last two as y_test, y_train — swapped. Corrected
# the unpacking order below.
x_train, x_test, y_train, y_test = train_test_split(test_dummies, Y1, test_size=0.2, random_state=0)
len(y_train)
# Score the last fitted model on the encoded test features.
# NOTE(review): test_dummies' columns differ from the training matrix
# x_dummies (e.g. Item_Outlet_Sales-related columns are absent) — verify
# column alignment before trusting these predictions.
predicted_values = model.predict(test_dummies)
print(predicted_values)
"""Predicted Values are:"""